# -*-coding:utf8-*-
import requests
from bs4 import BeautifulSoup
import uuid
import time
import downloadimg
import MyFileUtil
import OnlyTieZiAllImgUrl

LinkList = []
# 25,800 threads in total, 50 threads per listing page
str_host = 'https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn='
baiduurl = 'https://tieba.baidu.com'


# Fetch a listing page's HTML from the server.
#
# url: the page URL to request.
# Returns the decoded page text on HTTP 200, otherwise None
# (on non-200 status or any request failure).
def get_index(url):
    try:
        # Timeout keeps a dead/stalled connection from hanging the whole crawl.
        response = requests.get(url, timeout=10)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        # Non-200 responses carry no usable page body.
        return None
    except Exception as e:
        print('获取网页数据异常：'+str(e))
        return None


# Collect every thread link (anchors with class 'j_th_tit') from one
# listing page's HTML into the module-level LinkList, skipping duplicates.
#
# text: the page HTML, or None when get_index() failed.
def getPageUrl(text):
    # get_index() returns None on failure; feeding None to BeautifulSoup
    # would raise, so bail out early with nothing to parse.
    if not text:
        return
    try:
        soup = BeautifulSoup(text, "html.parser")
        # find_all is the modern name for the legacy findAll alias.
        anchors = soup.find_all('a', {'class': 'j_th_tit'})
        print(len(anchors))
        for anchor in anchors:
            href = anchor.get('href')
            # Skip anchors without an href (get() returns None) so that
            # baiduurl + url never concatenates with None downstream.
            if href and href not in LinkList:
                LinkList.append(href)
        print(len(LinkList))
    except Exception as e:
        print('error' + str(e))


# Download every image from each thread currently queued in LinkList.
# For each thread link: resolve it against baiduurl, extract the image
# URLs and thread title, then hand each image to the downloader.
def getAllImgUrl():
    # Alternative source: links persisted to disk via MyFileUtil.myred().
    print(LinkList)
    for link in LinkList:
        image_urls, title_name = OnlyTieZiAllImgUrl.analyze_page(baiduurl + link)
        print('开始爬取：' + title_name + ' 的图片')
        for image_url in image_urls:
            downloadimg.download_img(image_url, title_name)


# Crawl listing pages and download the images of every thread found.
#
# start/end/step: the pagination offsets fed to the listing URL
# (50 threads per page; the full corpus would be range(0, 25800, 50)).
# Defaults preserve the original hard-coded slice, so existing
# no-argument callers behave exactly as before.
def getAllPageUrl(start=1000, end=2000, step=50):
    for i in range(start, end, step):
        print('=======================================================================================================')
        print('虚幻虫爬取页数：'+str(i))
        print('=======================================================================================================')
        # Reset the shared queue so getAllImgUrl() only processes the
        # threads collected from the current page.
        LinkList.clear()
        getPageUrl(get_index(str_host+str(i)))
        getAllImgUrl()
    # Links could instead be persisted via MyFileUtil.mywrite(','.join(LinkList))


if __name__ == '__main__':
    # Entry point: crawl the configured listing-page range and download images.
    getAllPageUrl()

